import pandas as pd
import numpy as np
import pydotplus
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
%matplotlib inline
from scipy.stats import zscore
from io import StringIO
from efficient_apriori import apriori
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import preprocessing as spp
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report,roc_curve,auc
from sklearn.tree import export_graphviz
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from IPython.display import Image
# Load the raw HR attrition dataset and work on a deep copy so the original
# frame stays untouched for reference.
print("libraries imported!")
dataset = pd.read_csv('HR_comma_sep.csv',
sep= ',')
data = dataset.copy(deep=True)
data.head(10)
data.info()
# The department column is misspelled "departmennt" in the raw file; rename it.
data.rename(columns={'departmennt': 'department'}, inplace=True)
#Salary has two different names for same category like "low" and "Low", making them same
def salary(row):
    """Normalize the row's salary string to 'high', 'medium' or 'low'.

    The raw file mixes capitalizations of the same category (e.g. "low"
    and "Low"). The original code compared against exactly two spellings
    per category, so any other casing (e.g. "HIGH") silently fell through
    to 'low'. Comparing case-insensitively fixes that while preserving the
    original fallback: unrecognized values map to 'low'.
    """
    value = str(row['salary']).strip().lower()
    if value in ('high', 'medium'):
        return value
    return 'low'
# Apply the salary normalization, then impute missing values:
# continuous/count columns get their mode, categorical/boolean columns are
# backward-filled (each NaN takes the next valid observation).
data['salary'] = data.apply(salary,axis=1)
data.head()
data.tail()
# Mode imputation for numeric columns.
data['satisfaction_level'] = data['satisfaction_level'].fillna(data['satisfaction_level'].mode()[0])
data['last_evaluation'] = data['last_evaluation'].fillna(data['last_evaluation'].mode()[0])
data['number_project'] = data['number_project'].fillna(data['number_project'].mode()[0])
data['average_montly_hours'] = data['average_montly_hours'].fillna(int(data['average_montly_hours'].mode()[0]))
data['time_spend_company'] = data['time_spend_company'].fillna(data['time_spend_company'].mode()[0])
# Backward fill for the categorical / boolean columns.
data['Work_accident'] = data['Work_accident'].bfill()
data['promotion_last_5years'] = data['promotion_last_5years'].bfill()
data['department'] = data['department'].bfill()
data['salary'] = data['salary'].bfill()
data['left'] = data['left'].bfill()
# The salary column holds strings (high, medium, low). Add a numeric
# companion column ('salary_numerical') so describe() and correlations can
# include salary: high -> 3, medium -> 2, low -> 1.
def salary(row):
    """Return the ordinal rank of the row's salary: high=3, medium=2, otherwise 1."""
    return {'high': 3, 'medium': 2}.get(row['salary'], 1)
# Add the numeric salary column and summarize the now fully numeric frame.
data['salary_numerical'] = data.apply(salary, axis=1)
data.describe()
# Normalizing (dividing by the column mean) so differently-scaled columns can
# share one axis in the boxplot below. The original computed
# normalized_average_montly_hours twice on consecutive lines; the duplicate
# statement is removed.
normalized_average_montly_hours = data['average_montly_hours']/data['average_montly_hours'].mean()
normalized_time_spend_company = data['time_spend_company']/data['time_spend_company'].mean()
normalized_number_project = data['number_project']/data['number_project'].mean()
# Horizontal boxplot of every feature (mean-normalized where the raw scale
# would dwarf the 0-1 columns), with mean lines drawn for comparison.
fig, ax = plt.subplots()
ax.boxplot((data['satisfaction_level'],data['last_evaluation'],normalized_number_project,normalized_average_montly_hours,normalized_time_spend_company,data['Work_accident'],data['promotion_last_5years'],data['left']), vert=False, showmeans=True, meanline=True,
labels=('satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years','left'), patch_artist=True,
medianprops={'linewidth': 2, 'color': 'purple'},
meanprops={'linewidth': 2, 'color': 'red'})
plt.show()
# Lower-triangle correlation heatmap (the upper triangle is masked out since
# the matrix is symmetric). The np.bool alias was deprecated in NumPy 1.20
# and removed in 1.24, so the mask uses the builtin bool dtype instead.
sns.set(style="white")
corr = data.corr()
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
vf, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(corr, mask=mask, vmax=.3, center=0, annot=True,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
This correlation matrix tells us that the more satisfied someone is in their job, the less likely they are to leave.
Time spent with the company is positively correlated with leaving,
suggesting the longer someone works for this organization, the more likely they are to leave.
Another relationship that makes sense is that a higher salary appears to be negatively associated with leaving.
One correlation in the above matrix doesn't quite make sense:
having had a work accident is negatively related to leaving the company, which is counterintuitive — you would assume that an employee who felt unsafe at an organization would be more likely to want to leave.
# Compare feature means between leavers (left=1) and stayers (left=0).
left = data.groupby('left')
left.mean()
Here you can see that employees who left the company had a lower satisfaction level, a lower promotion rate, a lower salary, and worked more hours compared to those who stayed.
# Head-count per attrition class (any non-null column works as the counter).
left_count=data.groupby('left').count()
plt.bar(left_count.index.values, left_count['satisfaction_level'])
plt.xlabel('Employees Left Company')
plt.ylabel('Number of Employees')
plt.show()
data.left.value_counts()
Here you can see that out of roughly 15,000 employees, about 3,571 left and 11,428 stayed; the employees who left make up roughly 24% of the workforce.
# Distribution of employees by number of projects.
num_projects=data.groupby('number_project').count()
plt.bar(num_projects.index.values, num_projects['satisfaction_level'])
plt.xlabel('Number of Projects')
plt.ylabel('Number of Employees')
plt.show()
Most employees work on 3-5 projects.
# Distribution of employees by years spent at the company.
time_spent=data.groupby('time_spend_company').count()
plt.bar(time_spent.index.values, time_spent['satisfaction_level'])
plt.xlabel('Number of Years Spend in Company')
plt.ylabel('Number of Employees')
plt.show()
Most employees have between 2-4 years of tenure. Also, there is a massive gap between employees with 3 years and 4 years at the company.
# One count plot per categorical/count feature, laid out on a 4x2 grid.
# The loop body's indentation was lost in the notebook export (the file is
# currently an IndentationError); restored here.
features=['number_project','time_spend_company','Work_accident','left', 'promotion_last_5years','department','salary']
fig = plt.subplots(figsize=(10, 15))
for i, j in enumerate(features):
    plt.subplot(4, 2, i + 1)
    plt.subplots_adjust(hspace=1.0)
    sns.countplot(x=j, data=data)
    plt.xticks(rotation=90)
    plt.title("No. of employee")
You can observe the following points in the above visualization:
1. Most employees work on 3-5 projects.
2. There is a huge drop between employees with 3 years and 4 years at the company.
3. The employees who left make up roughly 24% of the workforce.
4. Decidedly few employees received a promotion in the last 5 years.
5. The sales department has the most employees, followed by technical and support.
6. Most employees receive either a medium or a low salary.
# Same grid of count plots, split by attrition (hue='left') to compare
# leavers vs stayers within each category. Loop-body indentation restored
# (lost in the notebook export).
fig = plt.subplots(figsize=(10, 15))
for i, j in enumerate(features):
    plt.subplot(4, 2, i + 1)
    plt.subplots_adjust(hspace=1.0)
    sns.countplot(x=j, data=data, hue='left')
    plt.xticks(rotation=90)
    plt.title("No. of employee")
You can observe the following points in the above visualization:
1. Employees with more than 5 projects tended to leave the company.
2. Employees who handled 6 or 7 projects left the company; it seems they were overloaded with work.
3. Employees with five years at the company leave more often, likely because of no promotion in the last 5 years, while those with more than 6 years rarely leave, perhaps out of attachment to the company.
4. Those who were promoted in the last 5 years didn't leave; i.e., nearly all of those who left had not been promoted in the previous 5 years.
Following features are most influencing a person to leave the company:
Promotions: Employees are far more likely to quit their job if they haven't received a promotion in the last 5 years.
Time with Company: Here, The three-year mark looks like a time to be a crucial point in an employee's career. Most of them quit their job around the three-year mark. Another important point is 6-years point, where the employee is very unlikely to leave.
Number Of Projects: Employee engagement is another critical factor to influence the employee to leave the company. Employees with 3-5 projects are less likely to leave the company. The employee with less and more number of projects are likely to leave.
Salary: Most of the employees that quit were among the mid or low salary groups.
# Mean attrition per department (since 'left' is 0/1, the mean is the rate).
data_department = data.groupby(by='department',as_index=False).mean()
#Pure Count
data_department.plot(kind = "bar", x ='department', y='left', color = 'red')
# Create a qualitative satisfaction column with three levels (high, medium, low).
def satisfaction(row):
    """Bucket satisfaction_level: >= 0.5 -> 'high', 0.3-0.5 -> 'medium', else 'low'.

    Fixes a label swap in the original, which returned 'medium' for the
    top bucket and 'high' for the middle one — contradicting the stated
    High/Medium/Low ranking and producing the "counterintuitive" attrition
    pattern noted further below.
    """
    level = row['satisfaction_level']
    if level >= .5:
        return 'high'
    elif .3 <= level < .5:
        return 'medium'
    else:
        return 'low'
# Apply the bucketing, then compare raw counts vs attrition RATE per bucket
# (rate = leavers / group size, fairer than the pure count).
data['qualitative_satisfaction'] = data.apply(satisfaction, axis=1)
data_satisfaction = data.groupby(by='qualitative_satisfaction',as_index=False).mean()
data_satisfaction.head()
#Pure Count
data_satisfaction.plot(kind = "bar", x ='qualitative_satisfaction', y='left', color = 'red')
#Percentage or Rate.
data_satisfaction = data.groupby(by=['qualitative_satisfaction'])
left_rate_satisfaction = data_satisfaction['left'].sum() / data_satisfaction['left'].count()
ax = left_rate_satisfaction.plot(kind='barh')
This is interesting: those with high satisfaction levels are more likely to leave based on the percentage of those who left, which doesn't make intuitive sense
under our three-category breakdown. This chart suggests that those with medium satisfaction are the least likely to leave.
# Before that, group the data by salary: raw head-counts, then attrition rate
# per salary band (leavers / group size).
data_salary = data.groupby(by=['salary'],as_index=False).count()
data_salary
#Pure Count
data_salary.plot(kind = "bar", x ='salary', y='left', color = 'red')
#Percentage or Rate.
data_salary = data.groupby(by=['salary'])
left_rate_salary = data_salary['left'].sum() / data_salary['left'].count()
ax = left_rate_salary.plot(kind='barh')
# Attrition rate within each (salary, qualitative_satisfaction) pair.
data_salary_satisfaction = data.groupby(by=['salary','qualitative_satisfaction'])
# data_salary_satisfaction.head()
left_rate_sal_sat = data_salary_satisfaction['left'].sum() / data_salary_satisfaction['left'].count()
ax = left_rate_sal_sat.plot(kind='barh', color='purple')
ax = sns.distplot(data['satisfaction_level'])
Satisfaction is not normally distributed.
# Mean of each feature within each satisfaction bucket, one bar chart apiece.
data_satisfaction = data.groupby(by='qualitative_satisfaction',as_index=False).mean()
# Satisfaction & Project Numbers
data_satisfaction.plot(kind = "bar", x ='qualitative_satisfaction', y='number_project', color = 'blue')
# Satisfaction & average_montly_hours
data_satisfaction.plot(kind = "bar", x ='qualitative_satisfaction', y='average_montly_hours', color = 'blue')
# Satisfaction & time_spend_company
data_satisfaction.plot(kind = "bar", x ='qualitative_satisfaction', y='time_spend_company', color = 'blue')
# Satisfaction & Work_accident
data_satisfaction.plot(kind = "bar", x ='qualitative_satisfaction', y='Work_accident', color = 'blue')
# Satisfaction & promotion_last_5years
data_satisfaction.plot(kind = "bar", x ='qualitative_satisfaction', y='promotion_last_5years', color = 'blue')
# Satisfaction & salary_numerical
data_satisfaction.plot(kind = "bar", x ='qualitative_satisfaction', y='salary_numerical', color = 'blue')
This shows that people with the lowest number of projects and the fewest monthly hours have the highest satisfaction, while people with the most projects and the highest monthly hours tend to lose satisfaction.
Total time spent at the company doesn't seem to have much effect on satisfaction, but people who spend 4 or more years tend to have low satisfaction, i.e., passing time affects their satisfaction.
People who have the most work accidents have medium satisfaction, but fewer work accidents doesn't promise more satisfaction.
Similarly, a promotion in the last five years doesn't have a linear effect on satisfaction.
Salary doesn't seem to have any effect on satisfaction, as people with high, medium or low satisfaction have about the same average salary. This is quite strange, because salary is believed to have a major effect on employee satisfaction.
# Data processing: back up the string columns, label-encode department, then
# drop the non-numeric columns so the frame is fully numeric for modelling.
dep_sal_backup = data[['department','salary']]
le = LabelEncoder()
k = le.fit_transform(data['department'])
data['department_numerical'] = k
data.drop([ 'salary', 'department', 'qualitative_satisfaction'], axis=1, inplace=True)
data.head()
# Matrix form of the frame, reused by k-means below.
X = data.to_numpy()
# Elbow method: fit k-means for k = 1..9 and plot inertia (within-cluster
# sum of squares) against k; the "elbow" marks a good n_clusters.
K = range(1, 10)
distortions = [
    KMeans(n_clusters=k, random_state=0).fit(X).inertia_
    for k in K
]
plt.figure(figsize=(8, 6))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal n_clusters')
plt.show()
The elbow method suggests the optimal number of clusters is 3.
# Initialize k-means with the elbow-suggested k=3.
kmeans = KMeans(n_clusters=3, random_state = 0)
# Fit
kmeans.fit(X)
# Print labels
print(kmeans.labels_)
# Attach each row's cluster label to a copy of the frame.
kmeans_data = data.copy(deep = True)
kmeans_data['labels'] = kmeans.labels_
kmeans_data.head()
kmeans_columns = kmeans_data.columns
print(type(kmeans_columns))
# Plot the per-cluster mean of every feature. Two fixes: select the feature
# columns by NAME instead of the positional columns.delete(10) (which silently
# breaks if the column set shifts), and hoist the loop-invariant groupby out
# of the loop instead of recomputing it once per feature.
kmeans_columns = kmeans_data.columns.drop('labels')
cluster_means = kmeans_data.groupby(by='labels', as_index=False).mean()
for col in kmeans_columns:
    cluster_means.plot(kind="bar", x='labels', y=col, color='red')
All three clusters have an almost equal salary level, department distribution, and time spent at the company.
Outlier detection can be done on satisfaction_level, last_evaluation, number_project, average_montly_hours, and time_spend_company, because the rest of the attributes are boolean or categorical.
# Boxplot-based outlier inspection on the continuous columns only (the rest
# are boolean/categorical). Count-like columns are mean-normalized so all
# three fit on one axis.
outlier_data = data.drop([ 'Work_accident', 'promotion_last_5years', 'left', 'salary_numerical', 'department_numerical'], axis='columns')
outlier_data.head()
fig, ax = plt.subplots()
ax.boxplot((outlier_data['satisfaction_level'], outlier_data['last_evaluation']), vert=False, showmeans=True, meanline=True,
labels=('satisfaction_level','last_evaluation'), patch_artist=True,
medianprops={'linewidth': 4, 'color': 'purple'},
meanprops={'linewidth': 4, 'color': 'red'})
plt.show()
norm_average_montly_hours = outlier_data['average_montly_hours']/outlier_data['average_montly_hours'].mean()
norm_number_project = outlier_data['number_project']/outlier_data['number_project'].mean()
norm_time_spend_company = outlier_data['time_spend_company']/outlier_data['time_spend_company'].mean()
fig, ax = plt.subplots()
ax.boxplot((norm_number_project,norm_time_spend_company ,norm_average_montly_hours ), vert=False, showmeans=True, meanline=True,
labels=('number_project','norm_time_spend_company','average_monthly_hours'), patch_artist=True,
medianprops={'linewidth': 4, 'color': 'purple'},
meanprops={'linewidth': 4, 'color': 'red'})
plt.show()
# Z-score outlier detection: flag any cell more than 3 standard deviations
# from its column mean. np.where returns (row_indices, col_indices).
zscore_data = data.drop([ 'Work_accident', 'promotion_last_5years', 'left', 'salary_numerical', 'department_numerical'], axis='columns')
zscore_data.head()
score = np.abs(zscore(zscore_data))
threshold = 3
zoutliers = np.where(score > threshold)
# type(outliers)
# One (row, column) pair per flagged cell.
zoutliers_in_dataset = pd.DataFrame(zoutliers).transpose()
zoutliers_in_dataset
Each row references a particular data point in zscore_data. The first element contains the row number and the second element the column number, which means zscore_data[11007][4] has a Z-score higher than 3.
# IQR outlier detection on the continuous columns, using widened Tukey
# fences (k = 3 instead of the usual 1.5).
iqr_data = data.drop([ 'Work_accident', 'promotion_last_5years', 'left', 'salary_numerical', 'department_numerical'], axis='columns')
iqr_data.head()
fence_factor = 3
for col in iqr_data.columns:
    q1 = np.percentile(iqr_data[col], 25)
    q3 = np.percentile(iqr_data[col], 75)
    spread = q3 - q1
    low_fence = q1 - fence_factor * spread
    high_fence = q3 + fence_factor * spread
    # Collect every value falling outside the fences.
    iqr_outliers = [value for value in iqr_data[col] if value < low_fence or value > high_fence]
    print("\n" + col + "\n")
    print(iqr_outliers)
# 70/30 train/test split, then a logistic-regression baseline.
# NOTE(review): no random_state is passed, so the split (and all accuracies
# below) changes run to run — confirm whether that is intended.
X=data.drop(['left'],axis=1)
Y=data['left']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.3,)
features = list(X_test.columns)
X_test
Y_test
lr = LogisticRegression(solver = 'lbfgs', max_iter = 400)
lr.fit(X_train, Y_train)
predict = lr.predict(X_train)
print("Logistic Regression Training Score is",lr.score(X_train,Y_train))
print("Logistic Regression Testing Score is",lr.score(X_test,Y_test))
accuracy_lr = lr.score(X_test,Y_test)*100
print("Accuracy of Logistic Regression is ",accuracy_lr)
# Print the fitted linear decision function (sum of coef * feature, plus the
# intercept), then the test-set confusion matrix and classification report.
# The loop body's indentation was lost in the notebook export; restored here.
for col, value in zip(data[features].columns, lr.coef_[0]):
    print(col, "*", value, "+")
print(lr.intercept_[0])
print("Confusion Matrix for logistic Regression")
print(confusion_matrix(Y_test, lr.predict(X_test)))
print("Classification Report for Logistic Regression")
print(classification_report(Y_test, lr.predict(X_test)))
As observed, our Logistic Regression has an accuracy of approximately 78% for its predictions, and at first glance this might seem to be a pretty well-performing model. However, this model has a lower specificity.
# Compute and plot the ROC curve and its area for the logistic regression.
# The original initialized n_classes and empty dicts for fpr/tpr/roc_auc and
# then immediately overwrote them — that dead code is removed.
# NOTE(review): the curve is computed on the FULL dataset (train + test), so
# it is optimistic; scoring X_test only would be the cleaner evaluation.
y_score = lr.decision_function(data[features])
fpr, tpr, _ = roc_curve(data['left'], y_score)
roc_auc = auc(fpr, tpr)
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw,
         label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
Findings:
# Decision tree (entropy split criterion): fit, score on the test set, and
# report confusion matrix + classification report.
dt=DecisionTreeClassifier(criterion = 'entropy')
dt.fit(X_train,Y_train)
prediction_dt=dt.predict(X_test)
accuracy_dt=accuracy_score(Y_test,prediction_dt)*100
print('Accuracy = ' + str(accuracy_dt))
prediction_dt
Y_test
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(Y_test, prediction_dt))
print("Classification Report for Decision Tree")
print(classification_report(Y_test, prediction_dt))
# Decision tree plot: export Graphviz dot (once to disk, once to an in-memory
# buffer) and render it as an inline PNG, then rank features by importance.
export_graphviz(dt,feature_names=features, out_file="dtree.dot",
filled = True, rounded = True,
special_characters = True)
dot_data = StringIO()
export_graphviz(dt, feature_names = features, out_file=dot_data,
filled = True, rounded = True,
special_characters= True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
feature_importance=pd.DataFrame(dt.feature_importances_,index=X_train.columns,columns=['Importance']).sort_values('Importance',ascending=False)
feature_importance
# Checking the tree on a hand-made sample.
# NOTE(review): the value order in custom_dt looks inconsistent with X's
# column order (the first feature, satisfaction_level, would be 10 here) —
# verify against `features` before trusting this prediction.
catagory = ['Employe will stay', 'Employ will leave']
custom_dt = [[10,200,4,0,0, 0.80, 0.96, 2, 4]]
# predict() returns an array; index its single element instead of calling
# int() on the array, which is an error on modern NumPy.
catagory[int(dt.predict(custom_dt)[0])]
# Standardize features for KNN (distance-based, so scaling matters). The
# scaler is fit on the training split only, then applied to both splits.
sc=StandardScaler().fit(X_train)
X_train_std=sc.transform(X_train)
X_test_std=sc.transform(X_test)
X_train_std
# Containers for the k-search below.
k_range=range(1,26)
scores={}
scores_list=[]
# Sweep k = 1..25, recording test accuracy for each, then plot accuracy vs k.
# The loop body's indentation was lost in the notebook export; restored here.
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_std, Y_train)
    prediction_knn = knn.predict(X_test_std)
    scores[k] = accuracy_score(Y_test, prediction_knn) * 100
    scores_list.append(accuracy_score(Y_test, prediction_knn))
scores
plt.plot(k_range, scores_list)
# Using n_neighbors = 1, as it had the highest accuracy in the sweep.
# NOTE(review): k=1 memorizes the training data and is prone to overfitting;
# confirm this choice against the accuracy-vs-k plot above.
knn=KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_std,Y_train)
prediction_knn=knn.predict(X_test_std)
accuracy_knn=accuracy_score(Y_test,prediction_knn)*100
print('Accuracy = ' + str(accuracy_knn))
prediction_knn
Y_test
# Checking the KNN model on a hand-made sample (scaled with the same scaler).
# NOTE(review): the value order looks inconsistent with X's column order
# (satisfaction_level would be 20 here) — verify against `features`.
X_knn=np.array([[20,500,10,6,0,0.10,0.30,1,8]])
X_knn_std=sc.transform(X_knn)
X_knn_prediction=knn.predict(X_knn_std)
catagory[X_knn_prediction[0]]
# Random forest (35 entropy trees): fit, score both splits, report the
# confusion matrix / classification report, and rank feature importances.
rf = RandomForestClassifier(n_estimators=35,criterion="entropy")
rf.fit(X_train,Y_train)
pred_rf = rf.predict(X_test)
print("Training accuracy for Random Forest is",rf.score(X_train,Y_train)*100)
print("Testing accuracy for Random Forest is",rf.score(X_test,Y_test)*100)
accuracy_rf = rf.score(X_test,Y_test)*100
print("Confusion Matrix of Random Forest \n",confusion_matrix(Y_test,pred_rf))
print("Classification report for Random Forest")
print(classification_report(Y_test,pred_rf))
feature_importances = pd.DataFrame(rf.feature_importances_,
index = features,
columns=['importance']).sort_values('importance',ascending=False)
feature_importances
Clearly, some features like satisfaction_level and time_spend_company have a higher impact on the retention of employees. People with higher satisfaction are more likely to stay in the organization.
# Compare test accuracy across the four classifiers.
# seaborn 0.12+ removed positional x/y arguments to barplot; pass keywords.
algorithms=['Logi. Regression', 'Decision Tree','KNN', 'Random Forest']
scores=[accuracy_lr, accuracy_dt,accuracy_knn, accuracy_rf]
plt.xlabel("Algorithms")
plt.ylabel("Accuracy Score")
sns.barplot(x=algorithms, y=scores)
plt.show()
def data_generator(filename):
    """
    Build a re-usable transaction source for apriori.

    Returns a zero-argument callable; each call opens *filename* afresh and
    yields one tuple of stripped, comma-separated fields per line. The
    indirection lets the caller scan the data several times without holding
    it all in memory — use a plain list instead if the file is small.
    """
    def data_gen():
        with open(filename) as handle:
            for row in handle:
                yield tuple(map(str.strip, row.split(',')))
    return data_gen
# Mine association rules straight from the CSV (each line is a transaction).
# NOTE(review): this reads the raw file, so the header row and unnormalized
# salary strings are included as items — confirm that is intended.
transactions = data_generator('HR_comma_sep.csv')
itemsets, rules = apriori(transactions, min_support=0.3, min_confidence=0.3)
rules
itemsets
# Two-component PCA over the numeric frame, then a tour of the fitted
# attributes. The original referenced get_covariance and get_precision
# WITHOUT calling them (a no-op that just evaluates the bound method);
# parentheses added so the values are actually computed.
# NOTE(review): PCA is scale-sensitive and the columns here have very
# different scales — consider StandardScaler before fitting.
pca = PCA(n_components = 2)
pca.fit(data)
pca.explained_variance_ratio_
pca.explained_variance_
pca.get_covariance()
pca.mean_
pca.score_samples(data)
pca.singular_values_
pca.get_precision()
Principal Component Analysis is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components.
It is a classification technique based on Bayes' Theorem with an assumption of independence among predictors. In simple terms, a Naive Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature. The model was trained using the training datasets generated previously, i.e., X_train and Y_train, with a training accuracy of approximately 79.89% and a test accuracy of approximately 80.13%. Gaussian Naive Bayes assumes that features follow a normal distribution, due to which the accuracy is not on par with the other methods. It can be observed from the output that 2,111 points were mislabeled, which makes up approximately 14.07% of the dataset.
# Gaussian Naive Bayes baseline: fit on the training split, score both
# splits, and count training points the model mislabels.
gnb = GaussianNB()
gnb.fit(X_train, Y_train)
pred_gnb = gnb.predict(X_train)
gnb.score(X_train, Y_train)
accuracy_gnb = gnb.score(X_test, Y_test)
accuracy_gnb
# Per-class probabilities for the test set.
gnb.predict_proba(X_test)
print("Number of mislabeled points out of total %d points : %d" %(X_train.shape[0], (Y_train != pred_gnb).sum()))
print("Accuracy = ", accuracy_gnb*100)